import os
import json
import gzip
from csv  import writer
import csv
import pandas as pd
import numpy as np
import subprocess as sub
import re
from ast import literal_eval
from scipy.spatial import cKDTree
from math import isnan
import sys
import multiprocessing
import tqdm

# Directory that holds the input files (left blank here; fill in before running).
# NOTE(review): downstream code combines this with file names, so include a
# trailing path separator.
in_path = ''
# Directory that receives the output files; it also becomes the working directory.
out_path = ''

# Every entry found in in_path, in sorted order; get_tweets slices this list.
thefiles = sorted(os.listdir(in_path))

print('\nNumber of files: {}\n'.format(len(thefiles)))

os.chdir(out_path)

# Output column order of the exported tab-separated files.
_EXPORT_COLUMNS = ('id', 'tweet_id', 'source', 'coordinates', 'place', 'text', 'user_lang', 'lang')


def _chunk_bounds(n_items, n_chunks, index):
    """Return the half-open ``(start, stop)`` slice bounds of chunk ``index``.

    The ``n_items`` positions are split into ``n_chunks`` contiguous chunks
    whose sizes differ by at most one, so each position falls in exactly one
    chunk and none is left over.
    """
    return (index * n_items) // n_chunks, ((index + 1) * n_items) // n_chunks


def _read_tweets(path):
    """Parse a gzip file holding one JSON document per line.

    Returns ``(tweets, n_bad)``: the list of decoded JSON objects (dicts) and
    the number of lines that were skipped because they could not be decoded
    or did not decode to a JSON object.
    """
    tweets = []
    n_bad = 0
    # The context manager guarantees the gzip handle is closed after reading.
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            try:
                record = json.loads(line)
            except (ValueError, RecursionError):
                # ValueError covers both malformed JSON and undecodable bytes.
                n_bad += 1
                continue
            if isinstance(record, dict):
                tweets.append(record)
            else:
                # Scalars/arrays/null cannot carry tweet fields; count them as bad lines.
                n_bad += 1
    return tweets, n_bad


def _tweet_columns(tweets):
    """Collect the exported fields of ``tweets`` into one list per column.

    A field that is absent from a record is stored as ``np.nan``; a field that
    is present is stored as-is (including ``None``).
    """
    columns = {name: [] for name in _EXPORT_COLUMNS}
    for tweet in tweets:
        user = tweet.get('user')
        if not isinstance(user, dict):
            # Missing or malformed user object: both user-derived fields become NaN.
            user = {}
        columns['id'].append(user.get('id_str', np.nan))
        columns['tweet_id'].append(tweet.get('id_str', np.nan))
        columns['source'].append(tweet.get('source', np.nan))
        columns['coordinates'].append(tweet.get('coordinates', np.nan))
        columns['place'].append(tweet.get('place', np.nan))
        columns['text'].append(tweet.get('text', np.nan))
        columns['user_lang'].append(user.get('lang', np.nan))
        columns['lang'].append(tweet.get('lang', np.nan))
    return columns


def get_tweets(the_instance):
    """Convert this worker's share of the input files to tab-separated text.

    Reads the module-level ``thefiles``, ``n_processes``, ``in_path`` and
    ``out_path``. The file list is divided into ``n_processes`` contiguous
    chunks and chunk number ``the_instance`` (0-based) is processed. For each
    regular file in the chunk, every line is parsed as JSON and one output
    file named ``<input name without '.txt.gz'>_tweet_source_with_coords_and_text.txt``
    is written into ``out_path``.

    Entries that are not regular files are skipped. Lines that fail to parse
    are counted and reported, not written.

    Args:
        the_instance: 0-based index of the chunk to process.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    start, stop = _chunk_bounds(len(thefiles), n_processes, the_instance)
    sub_thefiles = thefiles[start:stop]

    print('Getting tweet geos..\n')
    for j, fn in enumerate(sub_thefiles, start=1):
        # Progress marker: print the file name for every 24th file.
        if j % 24 == 1:
            print('  ' + str(fn) + '')

        in_file = os.path.abspath(os.path.join(in_path, fn))
        if not os.path.isfile(in_file):
            # Nothing to read; skipping avoids writing data from a previous file.
            print('      skipping (not a regular file): ' + str(fn))
            continue

        tweets, nexceptions = _read_tweets(in_file)
        print('      # of read line exceptions: ' + str(nexceptions) + ' (hour ' + fn[11:13] + ')')

        frame = pd.DataFrame(_tweet_columns(tweets))
        out_file = os.path.abspath(os.path.join(
            out_path,
            fn.replace('.txt.gz', '') + '_tweet_source_with_coords_and_text.txt',
        ))
        frame[list(_EXPORT_COLUMNS)].to_csv(
            out_file,
            sep='\t', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC
        )

def _set_worker_process_count(process_count):
    """Store the pool size in the module-level ``n_processes`` of a worker.

    ``get_tweets`` reads ``n_processes`` as a global, but that name is only
    assigned under the ``__main__`` guard below. Worker processes that
    re-import this module instead of inheriting the parent's memory never
    execute that guard, so the value is handed to them explicitly through
    the pool initializer.
    """
    global n_processes
    n_processes = process_count


if __name__ == '__main__':
    n_processes = 5 # args.n_processes
    # One task per worker: each task index selects that worker's chunk of files.
    which_process = list(range(n_processes))
    with multiprocessing.Pool(
        processes=n_processes,
        initializer=_set_worker_process_count,
        initargs=(n_processes,),
    ) as pool:
        pool.map(get_tweets, which_process)
